{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch10_hypotheses.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for Chapter 10 (hypothesis testing)\n",
    "\n",
    "---\n",
    "\n",
    "# About this code file:\n",
    "\n",
    "### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.\n",
    "\n",
    "### Solutions to all exercises are at the bottom of the notebook.\n",
    "\n",
    "#### This code was written in google-colab. The notebook may require some modifications if you use a different IDE."
   ],
   "metadata": {
    "id": "yeVh6hm2ezCO"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "aErGXtnTezCP"
   },
   "outputs": [],
   "source": [
    "# import libraries and define global settings\n",
    "import numpy as np\n",
    "import scipy.stats as stats\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "X7oweaU2C2kW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10.2: Empirical distribution under H0"
   ],
   "metadata": {
    "id": "_6ekrLZYbbQw"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 100 # per group per sample\n",
    "numExps = 1000\n",
    "\n",
    "meandiff = np.zeros(numExps)\n",
    "\n",
    "# run the experiment\n",
    "for i in range(numExps):\n",
    "  pre = stats.truncnorm(a=-5,b=10,loc=6,scale=2).rvs(N)\n",
    "  pst = stats.truncnorm(a=-5,b=10,loc=6,scale=2).rvs(N)\n",
    "\n",
    "  meandiff[i] = np.mean(pst) - np.mean(pre)\n",
    "\n",
    "\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.hist(meandiff,bins=20,edgecolor='k',color='gray')\n",
    "plt.xlabel(r'Difference value ($\\Delta$)')\n",
    "plt.ylabel('Count')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_empH0.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "oNWK6PHnbdzE"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Pi2owUA971aB"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10.3: Distribution assuming H0 is true"
   ],
   "metadata": {
    "id": "oV1Mx7jb71pK"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "plt.figure(figsize=(8,4))\n",
    "\n",
    "plt.hist(meandiff,bins=20,edgecolor='k',color=(.9,.9,.9))\n",
    "plt.axvline(.1,color='k',linestyle='--',linewidth=3)\n",
    "plt.axvline(.7,color='k',linestyle=':',linewidth=3)\n",
    "plt.xlabel('Difference value')\n",
    "plt.ylabel('Count')\n",
    "plt.legend([r'\"A\" ($\\Delta=.1$)',r'\"B\" ($\\Delta=.7$)','H0 dist.'])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_empH0_withAs.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "rU_MzhNqdVKy"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "kuNnaU_obd2B"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10.4: Analytical vs. empirical H0 distribution"
   ],
   "metadata": {
    "id": "QZ4PN_2Pbd44"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "empirical = np.random.normal(loc=0,scale=1,size=10000)\n",
    "x = np.linspace(-4,4,1001)\n",
    "analytical = stats.norm.pdf(x) * np.diff(x[:2])\n",
    "\n",
    "_,axs = plt.subplots(2,1,figsize=(4,6))\n",
    "\n",
    "axs[0].plot(x,analytical,'k')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Analytical H$_0$ distribution')\n",
    "axs[0].set(xlim=[-4,4],yticks=[],ylabel='Probability')\n",
    "\n",
    "axs[1].hist(empirical,bins='fd',color=(.8,.8,.8),edgecolor='k')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Empirical H$_0$ distribution')\n",
    "axs[1].set(xlim=[-4,4],yticks=[],ylabel='Count',xlabel='Test statistic value')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_empVanalyH0.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "p9RZpfGOxXDs"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "TQ-8Ufq1xXGq"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10.5: p-values and thresholds"
   ],
   "metadata": {
    "id": "Gz9VwdaIbd-L"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create a Gaussian probability curve\n",
    "zvals = np.linspace(-3,3,1001)\n",
    "zpdf  = stats.norm.pdf(zvals)\n",
    "\n",
    "\n",
    "_,axs = plt.subplots(3,1,figsize=(7,7))\n",
    "\n",
    "# plot the probability function and the vertical lines\n",
    "axs[0].plot(zvals,zpdf,'k',linewidth=2)\n",
    "axs[0].set(xlim=zvals[[0,-1]],ylim=[0,.42],yticks=[],ylabel='Prob. given H0')\n",
    "\n",
    "\n",
    "# two-tailed p-values\n",
    "pvalsL = stats.norm.cdf(zvals[:np.argmin(zvals**2)])\n",
    "pvalsR = 1-stats.norm.cdf(zvals[np.argmin(zvals**2):])\n",
    "pvals2 = 2*np.concatenate((pvalsL,pvalsR),axis=0) # doubled for a two-tailed test\n",
    "\n",
    "# plot the probability function and the vertical lines\n",
    "for i in range(1,3):\n",
    "  axs[i].plot(zvals,pvals2,'k',linewidth=2)\n",
    "  axs[i].set(xlim=zvals[[0,-1]],ylabel='P-value')\n",
    "  axs[i].axhline(.05,color=(.5,.5,.5),linestyle='--')\n",
    "\n",
    "\n",
    "\n",
    "# draw patches for significant regions\n",
    "zidx = np.arange(np.argmin((zvals-stats.norm.ppf(.025))**2))\n",
    "axs[0].fill_between(zvals[zidx],zpdf[zidx],color='k',alpha=.4)\n",
    "axs[1].fill_between(zvals[zidx],pvals2[zidx],color='k',alpha=.4)\n",
    "axs[2].fill_between(zvals[zidx],pvals2[zidx],color='k',alpha=.4)\n",
    "\n",
    "zidx = np.arange(np.argmin((zvals-stats.norm.ppf(.975))**2),len(zvals))\n",
    "axs[0].fill_between(zvals[zidx],zpdf[zidx],color='k',alpha=.4)\n",
    "axs[1].fill_between(zvals[zidx],pvals2[zidx],color='k',alpha=.4)\n",
    "axs[2].fill_between(zvals[zidx],pvals2[zidx],color='k',alpha=.4)\n",
    "\n",
    "axs[2].axvline(x=zvals[np.argmin((zvals-stats.norm.ppf(.025))**2)],ymin=0,ymax=4.1,c=(.7,.7,.7),linestyle=':',clip_on=False)\n",
    "axs[2].axvline(x=zvals[np.argmin((zvals-stats.norm.ppf(.975))**2)],ymin=0,ymax=4.1,c=(.7,.7,.7),linestyle=':',clip_on=False)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# indicators\n",
    "axs[0].annotate('\"Significant\"',xy=(-2.2,stats.norm.pdf(-2.2)+.01),xytext=(-2.3,.2),ha='center',arrowprops={'color':'k'})\n",
    "axs[0].annotate('\"Significant\"',xy=( 2.2,stats.norm.pdf( 2.2)+.01),xytext=( 2.3,.2),ha='center',arrowprops={'color':'k'})\n",
    "axs[0].text(0,.2,'\"Non-significant\"',ha='center')\n",
    "axs[1].text(2.3,.08,'p=.05')\n",
    "axs[2].text(2.3,.065,'p=.05')\n",
    "\n",
    "\n",
    "# panel titles\n",
    "axs[0].set_title(r'$\\bf{A}$)  Test statistic distribution if H$_0$ were true')\n",
    "axs[1].set_title(r'$\\bf{B}$)  P-value for test statistic values')\n",
    "axs[2].set_title(r'$\\bf{C}$)  Same as panel $\\bf{B}$ but in log scale')\n",
    "\n",
    "\n",
    "axs[2].set(yscale='log',xlabel='Test statistic (z-score)')\n",
    "axs[1].set_ylim([0,1.03])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_sigRegionsZandP.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "mRhVtjO_qPwu"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "sM6FaGtXqPzI"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10.6: H0 distribution with critical value"
   ],
   "metadata": {
    "id": "6dK99eewxXJR"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create a Gaussian probability curve\n",
    "x = np.linspace(-4,4,1001)\n",
    "gpdf = stats.norm.pdf(x)\n",
    "\n",
    "# create the figure and axis objects\n",
    "_,axs = plt.subplots(2,1,figsize=(7,5))\n",
    "\n",
    "\n",
    "# the find the indices of the 95% of the distribution\n",
    "ubndi = np.argmin(np.abs(x-stats.norm.ppf(.95)))\n",
    "\n",
    "# plot the probability function and the vertical lines\n",
    "axs[0].plot(x,gpdf,'k',linewidth=2)\n",
    "axs[0].set(xlim=x[[0,-1]],ylim=[0,.42],xticks=[],yticks=[],\n",
    "       xlabel='Test statistic',ylabel='Probability')\n",
    "\n",
    "# create patches for the significant area\n",
    "axs[0].fill_between(x[ubndi:],gpdf[ubndi:],color='k',alpha=.4)\n",
    "\n",
    "# annotations\n",
    "tailx = np.argmin(np.abs(x-2.2))\n",
    "axs[0].annotate('5%',xy=(x[tailx],gpdf[tailx]+.01),\n",
    "            xytext=(x[tailx]+1.1,gpdf[tailx]+.08),ha='center',\n",
    "            arrowprops={'color':'k'},weight='bold',size=16)\n",
    "\n",
    "# significance threshold line\n",
    "axs[0].plot([x[ubndi],x[ubndi]],[0,.4],'k--')\n",
    "axs[0].annotate('Sig. threshold',xy=[x[ubndi]+.05,.4],va='top',rotation=90,size=12)\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# the find the indices of the 2.5% and 97.5%\n",
    "lbndi = np.argmin(np.abs(x-stats.norm.ppf(.025)))\n",
    "ubndi = np.argmin(np.abs(x-stats.norm.ppf(1-.025)))\n",
    "\n",
    "\n",
    "# plot the probability function and the vertical lines\n",
    "axs[1].plot(x,gpdf,'k',linewidth=2)\n",
    "axs[1].set(xlim=x[[0,-1]],ylim=[0,.42],xticks=[],yticks=[],\n",
    "       xlabel='Test statistic',ylabel='Probability')\n",
    "\n",
    "\n",
    "# now create patches for the significant area\n",
    "axs[1].fill_between(x[:lbndi+1],gpdf[:lbndi+1],color='k',alpha=.4)\n",
    "\n",
    "# significance threshold line\n",
    "axs[1].plot([x[lbndi],x[lbndi]],[0,.4],'k--')\n",
    "axs[1].annotate('Sig. threshold',xy=[x[lbndi]+.05,.4],va='top',rotation=90,size=12)\n",
    "\n",
    "\n",
    "# repeat for the right lobe\n",
    "axs[1].fill_between(x[ubndi:],gpdf[ubndi:],color='k',alpha=.4)\n",
    "\n",
    "\n",
    "# annotations\n",
    "tailx = np.argmin(np.abs(x--2.5))\n",
    "axs[1].annotate('2.5%',xy=(x[tailx],gpdf[tailx]+.01),\n",
    "            xytext=(x[tailx]-1.1,gpdf[tailx]+.08),ha='center',\n",
    "            arrowprops={'color':'k'},weight='bold',size=16)\n",
    "tailx = np.argmin(np.abs(x-2.5))\n",
    "axs[1].annotate('2.5%',xy=(x[tailx],gpdf[tailx]+.01),\n",
    "            xytext=(x[tailx]+1.1,gpdf[tailx]+.08),ha='center',\n",
    "            arrowprops={'color':'k'},weight='bold',size=16)\n",
    "\n",
    "\n",
    "# significance threshold line\n",
    "axs[1].plot([x[ubndi],x[ubndi]],[0,.4],'k--')\n",
    "axs[1].annotate('Sig. threshold',xy=[x[ubndi]+.05,.4],va='top',rotation=90,size=12)\n",
    "\n",
    "\n",
    "\n",
    "# a few other niceties\n",
    "axs[0].axis('off')\n",
    "axs[1].axis('off')\n",
    "axs[0].set_title(r'$\\bf{A}$)  One-tailed test')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Two-tailed test')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_tails.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "u3JTp5-kLUFI"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Yl3fvY3jLUKh"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10.7: Area of z>1"
   ],
   "metadata": {
    "id": "YGhGujy8LUM2"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create a Gaussian probability curve\n",
    "z = np.linspace(-4,4,1001)\n",
    "gpdf = stats.norm.pdf(z)\n",
    "\n",
    "\n",
    "_,ax = plt.subplots(1,figsize=(6,3))\n",
    "\n",
    "\n",
    "# note that the cdf returns the area *left* of the input value,\n",
    "# so we subtract 1 to get the area to the *right*.\n",
    "areaZ1 = 1-stats.norm.cdf(1)\n",
    "\n",
    "# plot the probability function and the vertical lines\n",
    "ax.plot(z,gpdf,'k',linewidth=2)\n",
    "ax.set(xlim=z[[0,-1]],ylim=[0,.42],yticks=[],\n",
    "       xlabel='Z value',ylabel='Probability')\n",
    "\n",
    "xidx = np.arange(np.argmin(np.abs(z-1)),len(z))\n",
    "ax.fill_between(z[xidx],gpdf[xidx],color='k',alpha=.4)\n",
    "\n",
    "\n",
    "\n",
    "# annotations\n",
    "tailx = np.argmin(np.abs(x-1.7))\n",
    "ax.annotate(f'{100*areaZ1:.2f}%',xy=(z[tailx],gpdf[tailx]+.01),\n",
    "            xytext=(z[tailx]+1.1,gpdf[tailx]+.08),ha='center',\n",
    "            arrowprops={'color':'k'},weight='bold',size=16)\n",
    "\n",
    "# significance threshold line\n",
    "ax.plot([1,1],[0,.4],'k--',linewidth=1.5)\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_zgt1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "9w45mbP9ujaq"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "yyUmK8rpqP1h"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10.8: p-z pairs"
   ],
   "metadata": {
    "id": "jLV0p6mMbbNn"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# critical p-values to draw\n",
    "ps2draw = [ .05,.01,.001 ]\n",
    "\n",
    "stds = np.linspace(-4,4,1001)\n",
    "prob = stats.norm.pdf(stds)\n",
    "\n",
    "_,axs = plt.subplots(2,1,figsize=(7,7))\n",
    "\n",
    "# draw the lines\n",
    "for a in axs:\n",
    "  a.plot(stds,prob,'k')\n",
    "  a.set(ylabel='Probability',yticks=[],xlim=stds[[0,-1]],ylim=[0,.42])\n",
    "\n",
    "\n",
    "\n",
    "## one-tailed\n",
    "styles = ['--',':','-.']\n",
    "for i,p in enumerate(ps2draw):\n",
    "\n",
    "  # z-value for this p-value\n",
    "  zval = stats.norm.ppf(1-p)\n",
    "\n",
    "  # vertical line\n",
    "  c = i/len(ps2draw)*.8 # line color\n",
    "  axs[0].axvline(zval,color=(c,c,c),linestyle=styles[i])\n",
    "\n",
    "  # arrow and text\n",
    "  axs[0].annotate(f'p={p}, z={zval:.2f}',xy=(zval,.2-(i-1)/10),xytext=(0,.2-(i-1)/10),color=[c,c,c],\n",
    "                  bbox=dict(fc='w',edgecolor='none'),\n",
    "                  ha='center',va='center',arrowprops={'color':(c,c,c)})\n",
    "\n",
    "\n",
    "\n",
    "## two-tailed\n",
    "for i,p in enumerate(ps2draw):\n",
    "\n",
    "  # z-value for this p-value\n",
    "  zval = stats.norm.ppf(p/2)\n",
    "\n",
    "  # vertical line\n",
    "  c = i/len(ps2draw)*.8 # line color\n",
    "  axs[1].axvline(zval,color=(c,c,c),linestyle=styles[i])\n",
    "  axs[1].axvline(-zval,color=(c,c,c),linestyle=styles[i])\n",
    "\n",
    "  # arrow and text\n",
    "  axs[1].annotate(f'p={p}, z=|{abs(zval):.2f}|',xy=(zval,.2-(i-1)/10),xytext=(0,.2-(i-1)/10),color=[c,c,c],\n",
    "                  bbox=dict(fc='w',edgecolor='none'),\n",
    "                  ha='center',va='center',arrowprops={'color':(c,c,c)})\n",
    "  axs[1].annotate('',xy=(-zval,.2-(i-1)/10),xytext=(1.15,.2-(i-1)/10),arrowprops={'color':(c,c,c)})\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "# panel labels\n",
    "axs[0].set_title(r'$\\bf{A}$)  One-tailed $p$-z pairs')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Two-tailed $p$-z pairs')\n",
    "axs[1].set(xlabel='Standard deviations (z)')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_pz_combos2know.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "-0wkvH456MHh"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "wpw_trqxa2dP"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Getting p-values from z-values"
   ],
   "metadata": {
    "id": "Lfbh8DBma2Xh"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "zval = 1\n",
    "pval = stats.norm.cdf(zval)\n",
    "\n",
    "print(f'The p-value for z = {zval:.2f} is p = {1-pval:.4f}')"
   ],
   "metadata": {
    "id": "52RsQibla4S4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "8XFEbne1a2VL"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 10:14 Spatial image for MCC"
   ],
   "metadata": {
    "id": "A1glcdDja2Sb"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Code used for Figure 10.17\n",
    "M = np.random.randn(7,7)\n",
    "M[M<.2] = 0\n",
    "\n",
    "plt.imshow(M,cmap='gray',vmin=-.3,vmax=1)\n",
    "plt.xticks([])\n",
    "plt.yticks([])\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "9SFf8TBj96DX"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "lkxSs7Bn959q"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "ikJdK7YDScgp"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# No extra code here; find the relevant code cell, copy/paste that code here,\n",
    "#  and modify it to have the p-value be soft-coded. Make sure you also update\n",
    "#  the text in the figures!"
   ],
   "metadata": {
    "id": "OG7Sqg06SceF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "B_XBC4kkScbY"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "ZrDYwCNR956s"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "X = np.random.randn(20)\n",
    "p = stats.ttest_1samp(X,0)[1]\n",
    "print(f'The mean is {np.mean(X):.2f} and the p-value from a t-test is {p:.4f}')"
   ],
   "metadata": {
    "id": "xznjUVoOeqyI"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# simulate data and t-test\n",
    "alphaRange = np.linspace(.001,.9,15)\n",
    "\n",
    "M = 100\n",
    "type1errors = np.zeros((M,len(alphaRange)))\n",
    "meansAndPvals = np.zeros((M*len(alphaRange),2))\n",
    "\n",
    "# loop over alpha's\n",
    "for ai,alpha in enumerate(alphaRange):\n",
    "\n",
    "  # loop over experiments\n",
    "  for expi in range(M):\n",
    "\n",
    "    # generate the data and compute the p-value from a t-test\n",
    "    X = np.random.randn(20)\n",
    "    p = stats.ttest_1samp(X,0)[1]\n",
    "\n",
    "    # store (as Boolean) whether this test was subthreshold\n",
    "    type1errors[expi,ai] = p<alpha\n",
    "\n",
    "    # gather the mean and p-value from this test\n",
    "    meansAndPvals[expi*len(alphaRange) + ai,:] = [np.mean(X),p]\n",
    "\n",
    "\n",
    "_,axs = plt.subplots(1,2,width_ratios=[2,1],figsize=(10,4))\n",
    "axs[0].plot(alphaRange,np.mean(type1errors,axis=0),'ks',\n",
    "         markersize=13,markerfacecolor=(.8,.8,.8))\n",
    "axs[0].plot(alphaRange,alphaRange,'k--',zorder=-1)\n",
    "axs[0].set(xlabel='Predicted FA proportion',ylabel='Observed FA proportion')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Expected vs. empirical false alarm rate')\n",
    "\n",
    "axs[1].plot(meansAndPvals[:,0],meansAndPvals[:,1],'k.')\n",
    "axs[1].set(xlabel='Sample means',ylabel='P-values',ylim=[0,1])\n",
    "axs[1].set_title(r'$\\bf{B}$)  Means and p-values')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_ex2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "AAh0OX_h953Y"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Note about the previous code cell:\n",
    "#  You can implement many tests simultaneously in a matrix, with columns containing experiments.\n",
    "#  I'll introduce this in Chapter 12, but I show the code here FYI.\n",
    "\n",
    "type1errors = np.zeros(len(alphaRange)) # note: vector not matrix!\n",
    "for ai,alpha in enumerate(alphaRange):\n",
    "  p = stats.ttest_1samp(np.random.randn(20,M),0)[1] # compare input to that in previous cell\n",
    "  type1errors[ai] = np.mean(p<alpha) # here is the averaging\n",
    "\n",
    "# plot\n",
    "plt.plot(alphaRange,type1errors,'ks',markersize=13,markerfacecolor=(.8,.8,.8))\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "tYM64DO_a2O-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "BVeyYbIz_cwi"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 3"
   ],
   "metadata": {
    "id": "DVX544Bo_cs5"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# one t-value distribution\n",
    "tvals = np.linspace(-4,4,1001)\n",
    "\n",
    "# compute the pdf\n",
    "tpdf = stats.t.pdf(tvals,20)\n",
    "\n",
    "plt.plot(tvals,tpdf)\n",
    "plt.xlim(tvals[[0,-1]])\n",
    "plt.xlabel('t value')\n",
    "plt.ylabel('Probability')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "5zvT7s3KzuiF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# t-distributions with different df's\n",
    "tvals = np.linspace(-4,4,1001)\n",
    "dfs = np.arange(4,41)\n",
    "\n",
    "# initialize the figure\n",
    "plt.figure(figsize=(8,4))\n",
    "\n",
    "# create and show the pdf's\n",
    "for df in dfs:\n",
    "\n",
    "  # compute the pdf\n",
    "  tpdf = stats.t.pdf(tvals,df) * np.diff(tvals[:2])\n",
    "\n",
    "  # optional: for a nice visual effect, you can shift the pdf's:\n",
    "  #tpdf += df/100000\n",
    "\n",
    "  # plot\n",
    "  c = (df-np.min(dfs))/np.max(dfs) # color value with scaling\n",
    "  plt.plot(tvals,tpdf,color=(c,c,c))\n",
    "\n",
    "\n",
    "# then plot zscores (using \"tvals\" as z-values here)\n",
    "plt.plot(tvals,stats.norm.pdf(tvals)*np.diff(tvals[:2]),'r--',linewidth=2)\n",
    "\n",
    "\n",
    "plt.ylim([0,np.max(tpdf)*1.02])\n",
    "plt.xlim(tvals[[0,-1]])\n",
    "plt.xlabel('t or z value')\n",
    "plt.ylabel('Probability')\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_ex3.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "jxgp-JXdA2hj"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "5TuZGrEV_cqA"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 4"
   ],
   "metadata": {
    "id": "ey-WydDCQHGJ"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "from statsmodels.stats.multitest import fdrcorrection\n",
    "\n",
    "# p-value threshold (corrected)\n",
    "pThresh = .05\n",
    "\n",
    "# set of p-values\n",
    "k = 40\n",
    "pvals = np.random.uniform(low=.001,high=.3,size=k)**2\n",
    "\n",
    "# step 1: sort the p-values\n",
    "pvalsSort = np.sort(pvals)\n",
    "\n",
    "# step 2: linear interpolated distribution\n",
    "pvalsInterp = np.arange(1,k+1) / k\n",
    "\n",
    "# step 3: adjusted p-values\n",
    "pvals_adjusted = pvalsSort / pvalsInterp\n",
    "\n",
    "# final step implemented in fdrcorrection() is to take the running-minimum of sorted adjusted p-values.\n",
    "#pvals_adjusted = np.minimum.accumulate(pvals_adjusted[::-1])[::-1]\n",
    "\n",
    "\n",
    "# Using the statsmodel function.\n",
    "# This function returns a tuple with (0) Boolean rejections and (1) adjusted p-values.\n",
    "# Here we need only the second output.\n",
    "qq = fdrcorrection(pvalsSort,pThresh)[1]\n",
    "# Also note that I'm inputting pvalsSort instead of pvals. That's done to facilitate the\n",
    "# plotting; you would normally input the vector of p-values without sorting.\n",
    "\n",
    "\n",
    "\n",
    "## visualization!\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(qq,'ko',markerfacecolor=(.3,.3,.3),markersize=8,label='fdrcorrection()')\n",
    "plt.plot(pvals_adjusted,'k^',markerfacecolor=(.6,.6,.6),markersize=8,label='Manual')\n",
    "plt.plot(pvalsSort,'ks',markerfacecolor=(.9,.9,.9),markersize=8,label='Raw p-values')\n",
    "plt.plot(pThresh*pvalsInterp,color='gray',label=r'Rejection line ($\\alpha\\bf{v}$/k)',zorder=-10)\n",
    "plt.axhline(pThresh,linestyle='--',color=(.8,.8,.8),zorder=-10)\n",
    "plt.text(0,.052,'p=.05')\n",
    "\n",
    "# cross out non-significant p-values\n",
    "plt.plot(np.where(pvals_adjusted>pThresh)[0],pvalsSort[pvals_adjusted>pThresh],'kx')\n",
    "\n",
    "# final niceties\n",
    "plt.legend(bbox_to_anchor=[.75,.46])\n",
    "plt.xlabel('Sorted index')\n",
    "plt.ylabel('P-value')\n",
    "plt.xlim([-1,k])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_ex4.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "5bpZFbXDQHDQ"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "GNhINllXhEYd"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 5"
   ],
   "metadata": {
    "id": "cHFdaD9JhEVr"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# find the p-value threshold in the non-adjusted p-values\n",
    "\n",
    "# the p-values that are significant according to FDR correction\n",
    "H0rejected = pvalsSort <= pvalsInterp*pThresh\n",
    "\n",
    "# find the largest signficant (raw) pvalue\n",
    "H0rejected_pvals = np.where(H0rejected)[0]\n",
    "FDR_threshold = pvalsSort[H0rejected_pvals[-1]]\n",
    "\n",
    "print(f'Uncorrected p-value threshold based on FDR: q={FDR_threshold:.4f}')"
   ],
   "metadata": {
    "id": "r2nEyhYWbisc"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "lRSbg1mUQG1g"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 6"
   ],
   "metadata": {
    "id": "06CDCkDI67BT"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 100\n",
    "\n",
    "# p-values\n",
    "pvals = np.random.uniform(.001,.25,N)**2\n",
    "\n",
    "# thresholds\n",
    "bon_thresh = .05/N\n",
    "q = fdrcorrection(pvals,.05)\n",
    "\n",
    "# print messages\n",
    "print(f'       FDR led to {100*np.mean(q[0]):2.0f}% significant tests.')\n",
    "print(f'Bonferroni led to {100*np.mean(pvals<bon_thresh):2.0f}% significant tests.')"
   ],
   "metadata": {
    "id": "LsbKXd0069xU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Experiment repetitions with the same N=100\n",
    "\n",
    "sigTests = np.zeros((100,2))\n",
    "\n",
    "for expi in range(100):\n",
    "\n",
    "  pvals = np.random.uniform(.001,.25,N)**2\n",
    "  bon_thresh = .05/N\n",
    "  q = fdrcorrection(pvals,.05)\n",
    "\n",
    "  # record the results\n",
    "  sigTests[expi,0] = 100*np.mean(q[0])\n",
    "  sigTests[expi,1] = 100*np.mean(pvals<bon_thresh)\n",
    "\n",
    "\n",
    "# report the average and std\n",
    "print(f'       FDR: mean of {np.mean(sigTests[:,0]):5.2f}%  (std: {np.std(sigTests[:,0],ddof=1):.2f}) significant tests.')\n",
    "print(f'Bonferroni: mean of {np.mean(sigTests[:,1]):5.2f}%  (std: {np.std(sigTests[:,1],ddof=1):.2f}) significant tests.')"
   ],
   "metadata": {
    "id": "OC6GMWPr69zr"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "1C4uWqoQ1A_J"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 7"
   ],
   "metadata": {
    "id": "AFtqHJX31A6x"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# You need to have run the code for exercise 6 before this code.\n",
    "\n",
    "# the p-value set size\n",
    "Ns = np.logspace(np.log10(2),np.log10(500),25,dtype=int)\n",
    "\n",
    "# number of experiment repetitions\n",
    "nRepetitions = 100\n",
    "\n",
    "# results matrix (note: not storing the result of each repetition)\n",
    "sigTests = np.zeros((len(Ns),3))\n",
    "\n",
    "\n",
    "# and away we go!\n",
    "for ni,n in enumerate(Ns):\n",
    "\n",
    "  # loop over experiment repetitions\n",
    "  for _ in range(nRepetitions): # we don't need the counting variable here...\n",
    "\n",
    "    # p-values and corrections\n",
    "    pvals = np.random.uniform(.001,.25,n)**2\n",
    "    bon_thresh = .05/n\n",
    "    q = fdrcorrection(pvals,.05)\n",
    "\n",
    "    # record the results (note the summation)\n",
    "    sigTests[ni,0] += 100*np.mean(pvals<.05)        # uncorrected\n",
    "    sigTests[ni,1] += 100*np.mean(q[0])             # FDR-corrected\n",
    "    sigTests[ni,2] += 100*np.mean(pvals<bon_thresh) # Bonferroni-corrected\n",
    "\n",
    "\n",
    "# the code above keeps summing, so now divide by M repetitions to average\n",
    "sigTests /= nRepetitions\n",
    "\n",
    "\n",
    "# now for the visualization\n",
    "plt.figure(figsize=(7,4))\n",
    "plt.plot(Ns,sigTests[:,0],'k^',markersize=9,markerfacecolor=(.2,.2,.2),label='Uncorrected')\n",
    "plt.plot(Ns,sigTests[:,1],'ko',markersize=9,markerfacecolor=(.5,.5,.5),label='FDR')\n",
    "plt.plot(Ns,sigTests[:,2],'ks',markersize=9,markerfacecolor=(.8,.8,.8),label='Bonferroni')\n",
    "\n",
    "plt.legend()\n",
    "plt.xlabel('Number of p-values')\n",
    "plt.ylabel('Average % sub-threshold p-vals')\n",
    "plt.xlim([Ns[0]-8,Ns[-1]+8])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_ex7.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "Q4TCuNCw6919"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# In case you were wondering: the motivation for logarithmic scaling of set size\n",
    "# is the most of the interesting action happens with small samples. You can try\n",
    "# using a linear increase, or try setting the x-axis scale to be logarithmic."
   ],
   "metadata": {
    "id": "CeKFzG8-66zO"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "uModnr7g66wr"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 8"
   ],
   "metadata": {
    "id": "mDczjDjbc7lf"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "sampleSizeSkip = 3\n",
    "sampleSizeMax  = 201\n",
    "\n",
    "# initialize data variable and output vector\n",
    "data = np.random.randn(5)\n",
    "pvals = []\n",
    "ssizes = []\n",
    "\n",
    "while len(data)<sampleSizeMax:\n",
    "  # compute the p-value and sample sizes\n",
    "  pvals.append( stats.ttest_1samp(data,0).pvalue )\n",
    "  ssizes.append( len(data) )\n",
    "\n",
    "  # add more data!\n",
    "  data = np.append(data,np.random.randn(sampleSizeSkip))\n",
    "\n",
    "\n",
    "plt.figure(figsize=(8,4))\n",
    "\n",
    "plt.plot(ssizes,pvals,'ko',markersize=10,markerfacecolor=[.7,.7,.7])\n",
    "plt.axhline(y=.05,color='k',linestyle='--')\n",
    "plt.ylim([0,1.05])\n",
    "plt.xlabel('Sample sizes')\n",
    "plt.ylabel('P-value')\n",
    "plt.title('P-values in random Gaussian numbers',loc='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('hyp_ex8.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "XVRQqh-zao1Z"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "yeelS6ZiaojQ"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}